import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
data = pd.read_csv('2-game_sales_cleaned.csv')

# Draw one horizontal box plot per regional sales column (North America,
# Europe, Japan) so the spread and any outliers can be inspected visually.
# Only plotting happens here: no values in `data` are modified.
for sales_column in ('NA_Sales', 'EU_Sales', 'JP_Sales'):
    plt.boxplot(data[sales_column], vert=False)
    plt.title(f'2-{sales_column} Distribution ')
    plt.show()

# Feature columns used for clustering. Their order here fixes the column
# order of X_scaled (0 = JP_Sales, 1 = EU_Sales, 2 = NA_Sales).
features = ['JP_Sales', 'EU_Sales', 'NA_Sales']
X = data.loc[:, features]
# Rescale each feature to [0, 1] so that no single region dominates the
# distance computation. NOTE(review): the original author states the sales
# values already lie in roughly 0-2, so scaling is not strictly required;
# that range is not verified here.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# Run AGNES (agglomerative) hierarchical clustering with average linkage.
linked = linkage(X_scaled, 'average')

# Score each candidate cluster count (2 through 9) by its mean silhouette
# coefficient. Scores are keyed by the requested cluster count so the best
# count can be read back directly instead of being reconstructed from a list
# position plus the range's start value.
silhouette_by_k = {}
for n_clusters in range(2, 10):
    clusters = fcluster(linked, t=n_clusters, criterion='maxclust')
    # 'maxclust' is only an upper bound, so a cut may produce fewer clusters
    # than requested. silhouette_score needs 2 <= n_labels <= n_samples - 1
    # and raises ValueError otherwise, so skip cuts it cannot score.
    n_labels = len(set(clusters))
    if not 2 <= n_labels <= len(X_scaled) - 1:
        continue
    silhouette_by_k[n_clusters] = silhouette_score(X_scaled, clusters)

if not silhouette_by_k:
    raise ValueError(
        "No candidate cluster count between 2 and 9 produced a clustering "
        "that the silhouette coefficient can score."
    )

# Scores of the evaluated candidates, in increasing order of cluster count.
silhouette_avg = list(silhouette_by_k.values())
# Pick the cluster count with the highest silhouette coefficient; on ties the
# smallest count wins because max() returns the first maximum it encounters.
optimal_clusters = max(silhouette_by_k, key=silhouette_by_k.get)
print(f"Optimal number of clusters: {optimal_clusters}")
# Final flat clustering using the best cluster count.
final_clusters = fcluster(linked, t=optimal_clusters, criterion='maxclust')

# Visualize the hierarchical clustering as a dendrogram.
plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           # Pass the row labels as a plain list rather than a pandas Index:
           # an Index has no unambiguous truth value and is not among the
           # label types dendrogram documents. The rendered labels are the same.
           labels=data.index.tolist(),
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

# 2D view of the clustering using the first two scaled feature columns
# (column 0 and column 1 of X_scaled), coloured by final cluster label.
x_values = X_scaled[:, 0]
y_values = X_scaled[:, 1]
cluster_points = plt.scatter(
    x_values,
    y_values,
    c=final_clusters,
    cmap='viridis',
    alpha=0.7,
)
plt.title('AGNES Clustering Results')
plt.xlabel('Japan Sales (Scaled)')
plt.ylabel('Europe Sales (Scaled)')
# The colour bar maps colours back to cluster labels.
plt.colorbar(cluster_points, label='Cluster')
plt.show()

# Axes3D is not referenced by name below; the import is kept because older
# matplotlib releases need it to register the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter plot of all three scaled feature columns, coloured by final
# cluster label.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Transposing yields one row per feature column of X_scaled.
first_axis, second_axis, third_axis = X_scaled.T
ax.scatter(
    first_axis,
    second_axis,
    third_axis,
    c=final_clusters,
    cmap='viridis',
    alpha=0.7,
)
ax.set(
    xlabel='Japan Sales (Scaled)',
    ylabel='Europe Sales (Scaled)',
    zlabel='NA Sales (Scaled)',
    title='3D View of AGNES Clustering Results',
)
plt.show()